import numpy as np
import pandas as pd
import random as rd
import string
# Load the raw user-profile table and the per-user predicted-food table.
# NOTE(review): relative paths — assumes both CSVs sit in the working directory.
personal_information = pd.read_csv('personal_information.csv')
pred_food = pd.read_csv('pred_food.csv')
# Preview the first profile rows.
personal_information.head()
| user_id | name | gender | age | consumption_status | sleeping_status | height_cm | weight_kg | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | znpdujka | M | 51 | 3 | 1 | 178.57 | 66.28 |
| 1 | 2 | dciqezrl | M | 61 | 2 | 3 | 182.36 | 88.97 |
| 2 | 3 | jlkbynux | F | 42 | 3 | 2 | 170.24 | 69.74 |
| 3 | 4 | runpybnd | F | 23 | 1 | 4 | 158.38 | 60.75 |
| 4 | 5 | kquicatc | F | 34 | 3 | 2 | 166.09 | 71.10 |
# Preview the predicted-food table (30 food columns plus user_id).
pred_food.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | user_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Quesadilla | apples | bagels | chips | haiku roll | cupcakes | hamburger (and cheeseburgers and bacon cheeseb... | halibut | baked beans | Reuben | ... | franks | coffee | Irish stew | cake | gumbo | crab | broccoli | bluefish | Lamb | 1 |
| 1 | jerky | celery | clams | catfish | cake | bison | kingfish | Spinach | gumbo | bacon | ... | Wine | asparagus | barley | English muffins | artichoke | halibut | Irish stew | Milk | applesauce | 2 |
| 2 | donuts | Pepperoni | granola | applesauce | dates | jerky | English muffins | goose | apples | Venison | ... | BBQ | kale | Noodles | eel sushi | baked beans | broccoli | halibut | Avocado roll | hash browns | 3 |
| 3 | eggs | Reuben | Tater tots | Ziti | Wine | jambalaya | curry | Venison | dips | Spinach | ... | Garlic | Pancakes | baked beans | Yogurt | Toast | French toast | grits | falafel | Quesadilla | 4 |
| 4 | avacado | cookies | almond | English muffins | cake | Ziti | honey | carrots | Bruscetta | Wine | ... | gumbo | Apple juice | French dip | goose | Quesadilla | broccoli | apples | bagels | catfish | 5 |
5 rows × 31 columns
# Inner join of profiles with predicted foods; 'user_id' is the only shared
# column, so pd.merge joins on it by default.
# NOTE(review): the merged frame is displayed but never assigned — confirm
# this cell is inspection-only.
personal_information.merge(pred_food)
| user_id | name | gender | age | consumption_status | sleeping_status | height_cm | weight_kg | 0 | 1 | ... | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | znpdujka | M | 51 | 3 | 1 | 178.57 | 66.28 | jelly / jam | cheese | ... | coffee | jalapeƱo | Pepperoni | BBQ | buritto | arugala | Porter | Spinach | bluefish | Italian bread |
| 1 | 2 | dciqezrl | M | 61 | 2 | 3 | 182.36 | 88.97 | Spaghetti | chocolate | ... | ice cream | carrots | bluefish | fajita | almond | Ostrich | Quiche | gnocchi | cheese | halibut |
| 2 | 3 | jlkbynux | F | 42 | 3 | 2 | 170.24 | 69.74 | Toast | apples | ... | kingfish | French dip | Noodles | bacon | haiku roll | chimichanga | carne asada | bread | Ostrich | bison |
| 3 | 4 | runpybnd | F | 23 | 1 | 4 | 158.38 | 60.75 | alfalfa | honey | ... | chicken | bluefish | Quiche | BBQ | apples | kiwi | bread | clams | halibut | barley |
| 4 | 5 | kquicatc | F | 34 | 3 | 2 | 166.09 | 71.10 | chicken | ketchup | ... | ice cream | hash browns | broccoli | carne asada | Spaghetti | artichoke | beer | hamburger (and cheeseburgers and bacon cheeseb... | Garlic | chocolate |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | 996 | upewyige | M | 69 | 1 | 4 | 185.94 | 68.28 | Spaghetti | asparagus | ... | Garlic | Spinach | Porter | Lamb | arugala | granola | Cabbage | Walnuts | Quesadilla | applesauce |
| 996 | 997 | hiqiobms | M | 69 | 3 | 3 | 167.66 | 83.55 | kidney beans | fondu | ... | Meatballs | carne asada | Walnuts | alfalfa | babaganoosh | Avocado roll | curry | chocolate | crab | black beans |
| 997 | 998 | fduqivxb | M | 16 | 3 | 3 | 175.71 | 62.74 | buritto | fondu | ... | French toast | Ostrich | Irish stew | curry | Lasagna | dips | Porter | corn | duck | cereal |
| 998 | 999 | zsjmhybd | M | 77 | 3 | 2 | 186.75 | 86.11 | coffee | haiku roll | ... | Spaghetti | lobster | broccoli | celery | Ziti | Venison | cheese | halibut | gumbo | Milk |
| 999 | 1000 | mqoldlhw | M | 2 | 1 | 4 | 81.81 | 13.28 | kabobs | Spaghetti | ... | Quesadilla | buritto | cereal | Yogurt | Moose | goose | dates | Ostrich | almond | Garlic |
1000 rows × 38 columns
# Persist the (unchanged) frames back to CSV without the index column.
personal_information.to_csv('personal_information.csv', index=False)
pred_food.to_csv('pred_food.csv',index=False)
# Reload the profiles for the analysis below.
# NOTE(review): hard-coded absolute user path — prefer the relative
# 'personal_information.csv' written above so the notebook is portable.
df = pd.read_csv("/Users/peiyicai/Desktop/personal_information.csv")
# Summary statistics of the numeric columns.
df.describe()
| user_id | age | consumption_status | sleeping_status | height_cm | weight_kg | |
|---|---|---|---|---|---|---|
| count | 1000.000000 | 1000.0000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
| mean | 500.500000 | 41.1140 | 2.001000 | 2.977000 | 164.322240 | 64.822230 |
| std | 288.819436 | 22.8999 | 0.820776 | 1.441023 | 23.568473 | 21.551041 |
| min | 1.000000 | 1.0000 | 1.000000 | 1.000000 | 76.370000 | 12.450000 |
| 25% | 250.750000 | 22.0000 | 1.000000 | 2.000000 | 160.992500 | 55.670000 |
| 50% | 500.500000 | 42.0000 | 2.000000 | 3.000000 | 170.120000 | 68.130000 |
| 75% | 750.250000 | 61.0000 | 3.000000 | 4.000000 | 177.770000 | 78.692500 |
| max | 1000.000000 | 80.0000 | 3.000000 | 5.000000 | 190.000000 | 99.890000 |
# Count distinct values per column: user_id and name are unique per user,
# gender is binary, the status columns are small ordinal codes.
df.nunique()
user_id 1000 name 1000 gender 2 age 80 consumption_status 3 sleeping_status 5 height_cm 883 weight_kg 924 dtype: int64
# 'name' is a unique random string per user — carries no signal for
# clustering, so drop it in place.
df.drop(columns='name', inplace= True)
df.head(5)
| user_id | gender | age | consumption_status | sleeping_status | height_cm | weight_kg | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | M | 51 | 3 | 1 | 178.57 | 66.28 |
| 1 | 2 | M | 61 | 2 | 3 | 182.36 | 88.97 |
| 2 | 3 | F | 42 | 3 | 2 | 170.24 | 69.74 |
| 3 | 4 | F | 23 | 1 | 4 | 158.38 | 60.75 |
| 4 | 5 | F | 34 | 3 | 2 | 166.09 | 71.10 |
# Remaining feature columns after dropping 'name'.
df.columns
Index(['user_id', 'gender', 'age', 'consumption_status', 'sleeping_status',
'height_cm', 'weight_kg'],
dtype='object')
# Verify there are no missing values before encoding/scaling.
df.isnull().sum()
user_id 0 gender 0 age 0 consumption_status 0 sleeping_status 0 height_cm 0 weight_kg 0 dtype: int64
(1) Label encoding the categorical features
(2) Scaling the features using the standard scaler
(3) Creating a subset dataframe for dimensionality reduction (PCA)
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
Label encoding:
# Label-encode 'gender': with values {'F','M'}, LabelEncoder assigns
# F -> 0 and M -> 1 (alphabetical order of the observed classes).
label_encoder = preprocessing.LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])
df.head()
| user_id | gender | age | consumption_status | sleeping_status | height_cm | weight_kg | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 51 | 3 | 1 | 178.57 | 66.28 |
| 1 | 2 | 1 | 61 | 2 | 3 | 182.36 | 88.97 |
| 2 | 3 | 0 | 42 | 3 | 2 | 170.24 | 69.74 |
| 3 | 4 | 0 | 23 | 1 | 4 | 158.38 | 60.75 |
| 4 | 5 | 0 | 34 | 3 | 2 | 166.09 | 71.10 |
Feature Scaling
# Work on a copy so the original frame (with user_id) stays intact;
# user_id is an identifier, not a feature, so it is dropped from the model frame.
df_model = df.copy()
df_model.drop(columns='user_id', inplace=True)

# Standardize the numeric features to zero mean / unit variance.
# StandardScaler already scales column-wise, so a single fit over all the
# numeric columns replaces the original per-column loop (which rebuilt a
# fresh scaler on every iteration) with identical results.
num_cols = ['age', 'consumption_status', 'sleeping_status',
            'height_cm', 'weight_kg']
scaler = StandardScaler()
df_model[num_cols] = scaler.fit_transform(df_model[num_cols])
df_model.head(5)
| gender | age | consumption_status | sleeping_status | height_cm | weight_kg | |
|---|---|---|---|---|---|---|
| 0 | 1 | 0.431921 | 1.217750 | -1.372628 | 0.604829 | 0.067677 |
| 1 | 1 | 0.868823 | -0.001219 | 0.015969 | 0.765717 | 1.121053 |
| 2 | 0 | 0.038709 | 1.217750 | -0.678330 | 0.251214 | 0.228306 |
| 3 | 0 | -0.791404 | -1.220188 | 0.710268 | -0.252253 | -0.189052 |
| 4 | 0 | -0.310812 | 1.217750 | -0.678330 | 0.075043 | 0.291444 |
Dimension reduction using PCA:
# Reduce the 6 scaled features to 3 principal components for clustering
# and 3-D visualization.
pca = PCA(n_components=3)
pca.fit(df_model)
PCA_df = pd.DataFrame(pca.transform(df_model), columns=(["pca_1","pca_2", "pca_3"]))
# Per-component summary (component means are ~0 by construction).
PCA_df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| pca_1 | 1000.0 | 3.375078e-17 | 1.518887 | -2.248485 | -0.969409 | -0.401735 | 0.376758 | 4.607408 |
| pca_2 | 1000.0 | -6.328271e-18 | 1.019621 | -1.960823 | -0.830785 | 0.002051 | 0.815201 | 1.968591 |
| pca_3 | 1000.0 | -1.008083e-16 | 0.982167 | -1.924781 | -0.794694 | 0.000407 | 0.826620 | 1.997339 |
plot of the initial data after PCA:
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
# A 3D scatter of the data projected onto the three principal components.
x =PCA_df["pca_1"]
y =PCA_df["pca_2"]
z =PCA_df["pca_3"]
# To plot
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(x,y,z, c="maroon", marker="o" )
ax.set_title("A 3D Projection Of Data In The Reduced Dimension")
plt.show()
Elbow Method to determine the number of clusters to be formed
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
# Elbow method: KElbowVisualizer fits KMeans over a range of k (up to 15)
# and marks the elbow of the distortion-score curve as the suggested k.
print('Elbow Method to determine the number of clusters to be formed:')
Elbow_M = KElbowVisualizer(KMeans(), k=15)
Elbow_M.fit(PCA_df)
Elbow_M.show()
Elbow Method to determine the number of clusters to be formed:
<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
So we use 5 as the number of clusters to do the following steps
# Agglomerative (hierarchical) clustering with 5 clusters and average linkage.
# The deprecated 'affinity' argument (renamed to 'metric' in scikit-learn 1.2
# and removed in 1.4) is dropped; the default metric is already 'euclidean',
# so behavior is unchanged on every sklearn version.
AC = AgglomerativeClustering(n_clusters=5, linkage='average')
# Fit the model on the 3 principal components and predict a label per row.
yhat_AC = AC.fit_predict(PCA_df)
PCA_df["clusters_hier"] = yhat_AC
# Adding the Clusters feature to the original dataframe as well.
df["clusters_hier"] = yhat_AC
df.head()
| user_id | gender | age | consumption_status | sleeping_status | height_cm | weight_kg | clusters_hier | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 51 | 3 | 1 | 178.57 | 66.28 | 0 |
| 1 | 2 | 1 | 61 | 2 | 3 | 182.36 | 88.97 | 0 |
| 2 | 3 | 0 | 42 | 3 | 2 | 170.24 | 69.74 | 0 |
| 3 | 4 | 0 | 23 | 1 | 4 | 158.38 | 60.75 | 1 |
| 4 | 5 | 0 | 34 | 3 | 2 | 166.09 | 71.10 | 0 |
Plot:
import plotly as py
import plotly.graph_objs as go
# Interactive 3D scatterplot of the hierarchical clusters using plotly.
Scene = dict(xaxis = dict(title = 'pca_1 -->'),yaxis = dict(title = 'pca_2--->'),
zaxis = dict(title = 'pca_3-->'))
# AC.labels_ holds the predicted cluster of each row; color points by it.
x = PCA_df[['pca_1','pca_2','pca_3']].values
labels = AC.labels_
trace = go.Scatter3d(x=x[:, 0], y=x[:, 1], z=x[:, 2], mode='markers',marker=dict(color = labels, size= 10, line=dict(color= 'black',width = 10)))
layout = go.Layout(title= 'Clusters using Agglomerative Clustering',margin=dict(l=0,r=0),scene = Scene,height = 800,width = 800)
data = [trace]
fig = go.Figure(data = data, layout = layout)
fig.show()
# K-Means with 5 clusters (k chosen from the elbow plot above).
KM = KMeans(n_clusters = 5, init = "k-means++", max_iter = 300, n_init = 10, random_state = 0)
# BUG FIX: by this point PCA_df already carries the 'clusters_hier' label
# column added above, so fitting on the whole frame would leak the
# hierarchical labels into the K-Means features.  Restrict the fit to the
# three principal components.
y_KM = KM.fit_predict(PCA_df[['pca_1', 'pca_2', 'pca_3']])
PCA_df["clusters_km"] = y_KM
# Adding the Clusters feature to the original dataframe as well.
df["clusters_km"] = y_KM
Plot:
# Interactive 3D scatterplot of the K-Means clusters using plotly.
Scene = dict(xaxis = dict(title = 'pca_1 -->'),yaxis = dict(title = 'pca_2--->'),
zaxis = dict(title = 'pca_3-->'))
# KM.labels_ holds the predicted K-Means cluster of each row
# (the original "AC.labels_" comment here was a copy-paste leftover).
x = PCA_df[['pca_1','pca_2','pca_3']].values
labels = KM.labels_
trace = go.Scatter3d(x=x[:, 0], y=x[:, 1], z=x[:, 2], mode='markers',marker=dict(color = labels, size= 10, line=dict(color= 'black',width = 10)))
layout = go.Layout(title= 'Clusters using Kmeans',margin=dict(l=0,r=0),scene = Scene,height = 800,width = 800)
data = [trace]
fig = go.Figure(data = data, layout = layout)
fig.show()
from sklearn.neighbors import NearestNeighbors  # importing the library

# k-distance plot used to pick DBSCAN's eps: for each point, compute the
# distance to its nearest neighbour, sort the distances, and look for the
# "knee" of the curve.
neighb = NearestNeighbors(n_neighbors=2)
# FIX: fit and query with the same representation.  The original fitted on
# the bare numpy array `x` but queried with a DataFrame, which triggered
# sklearn's "X has feature names, but NearestNeighbors was fitted without
# feature names" UserWarning (see the captured output below).
pca_points = PCA_df[['pca_1', 'pca_2', 'pca_3']].values
nbrs = neighb.fit(pca_points)
distances, indices = nbrs.kneighbors(pca_points)
# Sort the distances and keep column 1: column 0 is each point's zero
# distance to itself.
distances = np.sort(distances, axis = 0)
distances = distances[:, 1]
plt.rcParams['figure.figsize'] = (5,3)  # setting the figure size
plt.plot(distances)  # plotting the sorted nearest-neighbour distances
plt.show()
/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but NearestNeighbors was fitted without feature names
# DBSCAN on the PCA coordinates (eps from the k-distance plot above).
# BUG FIX: by this point PCA_df also contains the 'clusters_hier' and
# 'clusters_km' label columns; fitting on the whole frame would leak those
# labels into the density computation.  Use only the three components.
db = DBSCAN(eps=0.3, min_samples=6, metric='euclidean', algorithm='auto', leaf_size=30)
y_db = db.fit_predict(PCA_df[['pca_1', 'pca_2', 'pca_3']])
PCA_df["clusters_DBSCAN"] = y_db
# Adding the Clusters feature to the original dataframe as well.
df["clusters_DBSCAN"] = y_db
# Inspect the labels found (-1 marks noise points).
np.unique(db.labels_)
array([-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20])
# Interactive 3D scatterplot of the DBSCAN clusters using plotly.
Scene = dict(xaxis = dict(title = 'pca_1 -->'),yaxis = dict(title = 'pca_2--->'),
zaxis = dict(title = 'pca_3-->'))
# db.labels_ holds the DBSCAN cluster of each point (-1 = noise);
# the original "AC.labels_" comment here was a copy-paste leftover.
x = PCA_df[['pca_1','pca_2','pca_3']].values
labels = db.labels_
trace = go.Scatter3d(x=x[:, 0], y=x[:, 1], z=x[:, 2], mode='markers',marker=dict(color = labels, size= 10, line=dict(color= 'black',width = 10)))
layout = go.Layout(title= 'Clusters using DBSCAN',margin=dict(l=0,r=0),scene = Scene,height = 800,width = 800)
data = [trace]
fig = go.Figure(data = data, layout = layout)
fig.show()
# Save the profiles with all three cluster labelings for the
# recommendation stage below.
df.to_csv('personal_information_labeled.csv',index=False)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Load the labeled dataset (profiles + cluster labels + food lists).
# NOTE(review): hard-coded absolute path again; the loaded file also has a
# 'pred_food_list' column that the code above never created — presumably
# it was joined in offline.  Verify against the data-preparation step.
df = pd.read_csv('/Users/peiyicai/Desktop/personal_information_labeled.csv', encoding='utf-8')
df.head(12)
| user_id | gender | age | consumption_status | sleeping_status | height_cm | weight_kg | clusters_hier | clusters_km | clusters_DBSCAN | pred_food_list | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 79 | 1 | 4 | 174.03 | 83.01 | 1 | 0 | 0 | Quesadilla, apples, bagels, chips, haiku roll,... |
| 1 | 2 | 0 | 74 | 2 | 4 | 168.05 | 67.93 | 1 | 1 | 1 | jerky, celery, clams, catfish, cake, bison, ki... |
| 2 | 3 | 0 | 39 | 1 | 4 | 164.47 | 59.86 | 1 | 0 | 0 | donuts, Pepperoni, granola, applesauce, dates,... |
| 3 | 4 | 0 | 40 | 1 | 3 | 177.81 | 55.27 | 1 | 0 | 2 | eggs, Reuben, Tater tots, Ziti, Wine, jambalay... |
| 4 | 5 | 0 | 15 | 1 | 5 | 169.12 | 66.82 | 1 | 0 | 3 | avacado, cookies, almond, English muffins, cak... |
| 5 | 6 | 1 | 7 | 1 | 1 | 126.16 | 22.44 | 4 | 4 | -1 | jambalaya, bison, grits, cake, chicken, gumbo,... |
| 6 | 7 | 1 | 75 | 3 | 5 | 177.44 | 85.26 | 2 | 1 | 7 | Walnuts, Wine, jerky, ketchup, bagels, franks,... |
| 7 | 8 | 0 | 8 | 1 | 2 | 122.28 | 25.29 | 0 | 4 | 21 | hummus, goose, duck, Ostrich, kale, artichoke,... |
| 8 | 9 | 1 | 43 | 3 | 1 | 172.49 | 95.49 | 2 | 2 | 4 | ham, arugala, Noodles, buritto, alfalfa, Yogur... |
| 9 | 10 | 1 | 68 | 3 | 3 | 181.15 | 70.93 | 2 | 1 | 5 | French dip, Venison, gnocchi, ginger, celery, ... |
| 10 | 11 | 0 | 48 | 3 | 3 | 165.69 | 64.15 | 2 | 1 | 5 | French toast, Milkshake, Lasagna, Graham crack... |
| 11 | 12 | 0 | 55 | 1 | 3 | 168.89 | 69.93 | 1 | 0 | 2 | apples, bread, grits, antelope, broccoli, avac... |
# Predict cluster membership for the first five users.
# NOTE(review): this *re-fits* KMeans on 5 raw (unscaled, non-PCA) rows with
# n_clusters=5, so each user just receives its own arbitrary label — it does
# not reuse the clustering learned above.  Presumably the intent was
# KM.predict on the scaled + PCA-transformed features; confirm.
input_user= df[df['user_id']<6].drop(['clusters_hier','clusters_km','clusters_DBSCAN','pred_food_list'],axis=1)
y_KM_user1 = KM.fit_predict(input_user)
y_KM_user1
array([3, 1, 4, 2, 0], dtype=int32)
# Restrict candidate recommendations to users in the same K-Means cluster
# as the query user: we recommend for user 2, whose 'clusters_km' label in
# the loaded table is 1.
expected_label=1
df_sub=df[df['clusters_km']==expected_label]
# NOTE(review): df_sub keeps the original frame's index labels (1, 6, 9, ...),
# which matters for the positional similarity lookups below — see note there.
df_sub.head(10)
| user_id | gender | age | consumption_status | sleeping_status | height_cm | weight_kg | clusters_hier | clusters_km | clusters_DBSCAN | pred_food_list | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2 | 0 | 74 | 2 | 4 | 168.05 | 67.93 | 1 | 1 | 1 | jerky, celery, clams, catfish, cake, bison, ki... |
| 6 | 7 | 1 | 75 | 3 | 5 | 177.44 | 85.26 | 2 | 1 | 7 | Walnuts, Wine, jerky, ketchup, bagels, franks,... |
| 9 | 10 | 1 | 68 | 3 | 3 | 181.15 | 70.93 | 2 | 1 | 5 | French dip, Venison, gnocchi, ginger, celery, ... |
| 10 | 11 | 0 | 48 | 3 | 3 | 165.69 | 64.15 | 2 | 1 | 5 | French toast, Milkshake, Lasagna, Graham crack... |
| 13 | 14 | 1 | 41 | 3 | 3 | 183.06 | 86.25 | 2 | 1 | 5 | Ostrich, kingfish, Quesadilla, French dip, Pan... |
| 14 | 15 | 1 | 59 | 3 | 4 | 167.35 | 85.02 | 2 | 1 | 6 | Lamb, ham, carne asada, franks, catfish, cooki... |
| 15 | 16 | 0 | 39 | 3 | 5 | 167.75 | 61.66 | 2 | 1 | 7 | coffee, hamburger (and cheeseburgers and bacon... |
| 17 | 18 | 0 | 56 | 3 | 3 | 170.06 | 69.69 | 2 | 1 | 5 | Venison, fajita, Porter, barley, French dip, Q... |
| 21 | 22 | 1 | 36 | 2 | 5 | 184.28 | 97.44 | 2 | 1 | 9 | Linguine, ketchup, BBQ, Lasagna, Moose, huenos... |
| 24 | 25 | 0 | 13 | 3 | 3 | 154.22 | 40.12 | 1 | 1 | -1 | cookies, bacon, Irish stew, hamburger (and che... |
# Get the positional row index of a user_id.
def index_from_id(df, id):
    """Return the *positional* row index of the row whose user_id equals ``id``.

    The cosine-similarity matrix used by ``recommendations`` is indexed by
    row position, but ``df_sub`` keeps the original frame's index labels
    (1, 6, 9, ...), so returning the index label — as the original
    ``.index.values[0]`` did — silently selects the wrong similarity row
    whenever the index is not a clean 0..n-1 RangeIndex.
    ``Index.get_loc`` converts the matched label to its position.

    Raises IndexError if ``id`` is not present (unchanged behavior).
    """
    # The parameter name ``id`` shadows the builtin, but is kept so any
    # existing keyword callers are unaffected.
    label = df[df['user_id'] == id].index[0]
    return df.index.get_loc(label)
# id: the user_id to find matches for (generic: could be a student id or a
#     company id, matching against the other side's keyword sets)
# df: dataframe holding 'user_id' and 'pred_food_list' columns
# cosine_similarity_matrix: pairwise similarity of df's rows (row-positional)
# number_of_recommendations: how many similar users to return
# NOTE(review): index_from_id returns an index *label*, while the similarity
# matrix and .iloc below are positional — these only agree when df has a
# clean 0..n-1 RangeIndex, which df_sub does not.  Verify.
def recommendations( id, df,cosine_similarity_matrix,number_of_recommendations):
    """Return the users most similar to `id` along with their food lists."""
    index = index_from_id(df,id)
    # Pair each row position with its similarity to the query row, then
    # rank with the most similar first.
    similarity_scores = list(enumerate(cosine_similarity_matrix[index]))
    similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    # Skip element 0 (the query itself, similarity 1.0) and keep the top k.
    recommendations_indices = [t[0] for t in similarity_scores_sorted[1:(number_of_recommendations+1)]]
    # this could be df['ID']
    return df[['user_id','pred_food_list']].iloc[recommendations_indices]
# Bag-of-words representation of each user's predicted food list
# (English stop words removed).
vect = CountVectorizer(stop_words='english')
# Transform df_sub['pred_food_list'] into a document-term matrix.
vect_matrix = vect.fit_transform(df_sub['pred_food_list'])
vect_matrix
# Pairwise cosine similarity between users' food vectors (row-positional).
cosine_similarity_matrix_count_based = cosine_similarity(vect_matrix, vect_matrix)
# Top-10 food-list recommendations for user 2 within their cluster.
recommendations(2, df_sub,cosine_similarity_matrix_count_based,10)
| user_id | pred_food_list | |
|---|---|---|
| 827 | 828 | cupcakes, hot dogs, kale, Yogurt, granola, ice... |
| 795 | 796 | franks, chowder, antelope, jerky, Cabbage, Moo... |
| 983 | 984 | chowder, ham, kale, jelly / jam, chocolate, La... |
| 679 | 680 | kabobs, bread, Pizza, BBQ, Spaghetti, Quesadil... |
| 951 | 952 | huenos rancheros, apples, dips, bagels, Wine, ... |
| 302 | 303 | Quiche, Ziti, French dip, celery, Moose, fondu... |
| 978 | 979 | barley, apples, asparagus, jalapeƱo, almond, b... |
| 989 | 990 | cupcakes, ham, bread, Avocado roll, fondu, chi... |
| 191 | 192 | Wine, cupcakes, arugala, black beans, Venison,... |
| 254 | 255 | Pizza, dumplings, Ostrich, Spaghetti, artichok... |